Client Report - Can You Predict That?

Unit 4 Task 1

Author

Ezekial Curran

Show the code
# import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import polars as pl
import numpy as np
from lets_plot import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.metrics import (
  classification_report,
  accuracy_score,
  recall_score,
  precision_score,
  f1_score,
  r2_score,
  mean_absolute_error,
  mean_squared_error
)

LetsPlot.setup_html(isolated_frame=True)
Show the code
# Dataset locations in the byuidatascience/data4dwellings repository.
# NOTE(review): none of these URLs is read below -- the data comes from a local
# copy of dwellings_ml.csv (presumably downloaded from url1; confirm).
url1 = "https://github.com/byuidatascience/data4dwellings/raw/master/data-raw/dwellings_ml/dwellings_ml.csv"
url2 = "https://github.com/byuidatascience/data4dwellings/raw/master/data-raw/dwellings_neighborhoods_ml/dwellings_neighborhoods_ml.csv"
url3 = "https://github.com/byuidatascience/data4dwellings/raw/master/data-raw/dwellings_denver/dwellings_denver.csv"
# url4 points at a markdown page (data.md), not at a CSV file.
url4 = "https://github.com/byuidatascience/data4dwellings/blob/master/data.md"

# Load the modelling table with polars from the working directory.
df = pl.read_csv('dwellings_ml.csv')

QUESTION

Build a classification model labeling houses as being built “before 1980” or “during or after 1980”. Your goal is to reach or exceed 90% accuracy. Report your final model choice and any other model parameters you may have tweaked (train-test split ratio, tuning parameters, etc).

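The model chosen was scikit-learn's GradientBoostingClassifier (gradient boosting). Its tuning parameters were left at their defaults; the train-test split used a test_size of 0.2 and a random state of 117. On the held-out test set the model reaches 90% accuracy (see the classification report below).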

Show the code
# Build the feature matrix and target for the "built before 1980" classifier.
# Dropped columns: "parcel", "before1980" (the target itself) and "yrbuilt"
# (presumably the year the target is derived from, so it would leak the answer).
X = df.drop(["parcel", "before1980", "yrbuilt"])
y = df.select(["before1980"])

# 80/20 split with a fixed seed so the same rows land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=117)

# NOTE: despite the variable name this is scikit-learn's GradientBoostingClassifier,
# not XGBoost. The estimator is seeded as well so repeated runs give the same model.
model_xgb = GradientBoostingClassifier(random_state=117)
# y_train is a one-column frame; scikit-learn expects a 1-D target, so flatten it
# explicitly instead of relying on its column-vector conversion (and warning).
model_xgb.fit(X_train, y_train.to_numpy().ravel())
GradientBoostingClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Show the code
# Score the fitted gradient-boosting model on the held-out 20% split and print
# per-class precision / recall / F1 together with the overall accuracy.
pred = model_xgb.predict(X_test)
print(classification_report(y_test, pred))
              precision    recall  f1-score   support

           0       0.88      0.84      0.86      1756
           1       0.91      0.93      0.92      2827

    accuracy                           0.90      4583
   macro avg       0.89      0.89      0.89      4583
weighted avg       0.90      0.90      0.90      4583

Extra

Use a neural network to classify. Make sure the required packages, tools, GPU drivers, and CUDA toolkit are installed.

Use this site for more clarity: https://pytorch.org/get-started/locally/

(to check your CUDA version)

nvcc --version

Command used if CUDA version is 12.6

pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
Show the code
# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tell the reader which device was picked (and which GPU, when there is one).
if device.type == "cuda":
  print("CUDA is available, using the GPU")
  print(torch.cuda.get_device_name(0))  # name of GPU 0
else:
  print("CUDA is not available, using the CPU")
CUDA is available, using the GPU
NVIDIA GeForce RTX 4060 Laptop GPU
Show the code
# Converting to numpy for PyTorch
X_raw = X.to_numpy()
y_nn = y.to_numpy()

# Split BEFORE scaling so the scaler's mean/std are learned from the training
# rows only; fitting it on the full data would leak test-set statistics into
# training. Same test_size / random_state as before, so the same rows are held out.
X_raw_trn, X_raw_tst, y_nn_trn, y_nn_tst = train_test_split(X_raw, y_nn, test_size=0.2, random_state=117)

scaler = StandardScaler()
X_nn_trn = scaler.fit_transform(X_raw_trn)  # fit on train only
X_nn_tst = scaler.transform(X_raw_tst)      # reuse the training statistics
# Whole feature matrix on the same scale; later cells read X_nn for the input
# size and for permutation importance.
X_nn = scaler.transform(X_raw)

# Converting to PyTorch Tensors
# Targets are reshaped to (n, 1) float columns, the shape BCEWithLogitsLoss
# needs to match the network's single-logit output.
X_trn_tensor = torch.tensor(X_nn_trn, dtype=torch.float32)
y_trn_tensor = torch.tensor(y_nn_trn, dtype=torch.float32).view(-1, 1)
X_tst_tensor = torch.tensor(X_nn_tst, dtype=torch.float32)
y_tst_tensor = torch.tensor(y_nn_tst, dtype=torch.float32).view(-1, 1)

if torch.cuda.is_available():
  print(f"Moving Tensors to the GPU")
  X_trn_tensor = X_trn_tensor.to(device)
  y_trn_tensor = y_trn_tensor.to(device)
  X_tst_tensor = X_tst_tensor.to(device)
  y_tst_tensor = y_tst_tensor.to(device)

# For multi-class classification with CrossEntropyLoss() the LABELS (y) must be
# a LongTensor of class indices -- floats and strings are not allowed there.
# The features (X) stay float32.
# y_trn_tensor = torch.tensor(y_nn_trn, dtype=torch.long).view(-1)
# If the labels are strings, convert them to indices first, e.g.
# class_names = ["cat", "dog", "bird"]
# labels = ["dog", "cat", "bird", "cat"]
# class_to_index = {name: idx for idx, name in enumerate(class_names)}
# indices = [class_to_index[label] for label in labels]
# y_trn_tensor = torch.tensor(indices, dtype=torch.long)
Moving Tensors to the GPU
Show the code
# Defining a simple neural network
# Inherits from PyTorch's nn.Module
class Net(nn.Module):
  """Feed-forward network with a single hidden ReLU layer.

  The network returns raw, un-activated outputs (logits for classification);
  apply sigmoid / softmax outside the model or use a loss that does it
  internally (e.g. BCEWithLogitsLoss, CrossEntropyLoss).

  Args:
    input_size: Number of input features per sample.
    hidden_size: Number of neurons in the hidden layer. Defaults to 16.
    output_size: Number of output units. The default of 1 suits regression and
      binary-classification logits; pass the number of classes for
      multi-class classification.
  """

  def __init__(self, input_size, hidden_size=16, output_size=1):
    super().__init__()  # Sets up nn.Module's internal machinery (parameter registration etc.)
    self.fc1 = nn.Linear(input_size, hidden_size)   # First fully connected layer: inputs -> hidden neurons
    self.relu = nn.ReLU()                           # Non-linearity: replaces negative values with 0
    self.fc2 = nn.Linear(hidden_size, output_size)  # Final fully connected layer: hidden neurons -> outputs

  def forward(self, x):
    """Run the input tensor x through fc1 -> ReLU -> fc2 and return the result."""
    return self.fc2(self.relu(self.fc1(x)))

# Adding a classifier for measuring importance
# class TorchClassifier(BaseEstimator, ClassifierMixin):
#   def __init__(self, model, device):
#     self.model = model
#     self.device = device

#   def fit(self, X, y):
#     return self

#   def predict(self, X):
#     self.model.eval()
#     with torch.no_grad():
#       X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
#       logits = self.model(X_tensor)
#       if logits.shape[1] == 1:
#         probs = torch.sigmoid(logits).cpu().numpy()
#         return (probs >= 0.5).astype(int).flatten()
#       else:
#         return torch.argmax(logits, dim=1).cpu().numpy()

class TorchModelWrapper(BaseEstimator):
  """Adapter exposing an already-trained PyTorch model through the scikit-learn
  estimator interface (fit / predict / score), e.g. for permutation_importance.

  Args:
    model: A trained torch module; it is called as model(tensor).
    device: torch device the inputs are moved to before calling the model.
    task: "classification" (default) or "regression".
    scorer: Name of the metric used by score(); one of the keys of _SCORERS.

  NOTE: "MSE" and "MAE" are errors (lower is better) while scikit-learn tools
  such as permutation_importance assume score() is higher-is-better, so
  importances computed with those two scorers come out with the opposite sign.
  """

  # Metric name -> scikit-learn metric, every one called as metric(y_true, y_pred).
  _SCORERS = {
    "f1_score": f1_score,
    "accuracy_score": accuracy_score,
    "MSE": mean_squared_error,
    "MAE": mean_absolute_error,
    "r2_score": r2_score,
    "recall_score": recall_score,
    "precision_score": precision_score,
  }
  _TASKS = ("classification", "regression")

  def __init__(self, model, device, task="classification", scorer="f1_score"):
    self.model = model
    self.device = device
    self.task = task # 'classification' or 'regression'
    self.scorer = scorer

  def fit(self, X, y):
    """Do nothing and return self; the wrapped model is trained elsewhere."""
    return self

  def predict(self, X):
    """Return the model's predictions for X as a 1-D numpy array.

    Regression returns the flattened raw outputs. Classification returns 0/1
    labels (sigmoid, 0.5 threshold) when the model emits a single logit per
    sample, otherwise the argmax over the class dimension.

    Raises:
      ValueError: If self.task is neither "classification" nor "regression".
    """
    # Validate up front so a bad task fails before any model / device work.
    if self.task not in self._TASKS:
      raise ValueError(f"Unsupported task type: {self.task}\nOnly two types of tasks are available: regression and classification (default)")

    self.model.eval()
    with torch.no_grad():
      X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
      outputs = self.model(X_tensor)

      if self.task == "regression":
        return outputs.cpu().numpy().flatten()

      # A single logit per sample (shape (n, 1), or (n,) if the model squeezes
      # its output) means binary classification.
      if outputs.ndim == 1 or outputs.shape[1] == 1:
        probs = torch.sigmoid(outputs).cpu().numpy()
        return (probs >= 0.5).astype(int).flatten()
      return torch.argmax(outputs, dim=1).cpu().numpy()

  def score(self, X, y):
    """Return the metric named by self.scorer computed on predict(X) versus y.

    Raises:
      ValueError: If self.scorer is not a supported metric name.
    """
    metric = self._SCORERS.get(self.scorer)
    if metric is None:
      raise ValueError(f"Unsupported scoring method: {self.scorer}")
    return metric(y, self.predict(X))
Show the code
# Seed PyTorch before the network is created so the random weight
# initialisation -- and with it the training run -- can be repeated.
# (Some GPU kernels may still introduce small run-to-run differences.)
torch.manual_seed(117)

# One input unit per (scaled) feature column.
model_nn = Net(input_size=X_nn.shape[1])

if torch.cuda.is_available():
  print(f"Moving the model to the GPU")
  model_nn = model_nn.to(device)

# Use this for regression -- Other loss functions can be used instead of MSE
# criterion = nn.MSELoss()

# Use this for binary classification
# (applies the sigmoid internally, so the network outputs raw logits)
criterion = nn.BCEWithLogitsLoss()

# Use this for multi classification
# criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model_nn.parameters(), lr=0.01)
Moving the model to the GPU
Show the code
# Training loop
# Every step uses the whole training tensor at once (no mini-batches).
losses = []
loss_epochs =[]

epochs = 15000
# Report progress roughly ten times per run. Integer arithmetic keeps the
# modulo test exact, and max() guards against an interval of 0 when epochs < 10.
log_every = max(1, epochs // 10)

# Nothing inside the loop switches the model to eval mode, so set train mode once.
model_nn.train()
for epoch in range(epochs):
  outputs = model_nn(X_trn_tensor)
  loss = criterion(outputs, y_trn_tensor)

  # Clear the gradients of the previous step, back-propagate, update the weights.
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  # Read the scalar once per step and reuse it for the history and the log line.
  loss_value = loss.item()
  losses.append(loss_value)
  loss_epochs.append(epoch)

  if (epoch + 1) % log_every == 0:
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss_value:.4f}")

# Loss history (one row per epoch) for plotting.
loss_track = pl.DataFrame({
  "epoch": loss_epochs,
  "loss": losses
})
Epoch 1500/15000, Loss: 0.1933
Epoch 3000/15000, Loss: 0.1869
Epoch 4500/15000, Loss: 0.1853
Epoch 6000/15000, Loss: 0.1842
Epoch 7500/15000, Loss: 0.1838
Epoch 9000/15000, Loss: 0.1831
Epoch 10500/15000, Loss: 0.1834
Epoch 12000/15000, Loss: 0.1828
Epoch 13500/15000, Loss: 0.1828
Epoch 15000/15000, Loss: 0.1827
Show the code
# Line chart of the training loss recorded for every epoch in loss_track.
loss_graph = (
    ggplot(data=loss_track)
    # NOTE(review): 'loss' is a numeric column mapped to color and then given a
    # one-value manual scale; this looks like a workaround for drawing a plain
    # blue line -- a constant color='blue' outside aes() may be what is meant.
    + geom_line(mapping = aes(x = 'epoch', y = 'loss', color='loss'))
    + guides(color="none")
    + scale_color_manual(values=['blue'])
    + labs(
      title="Loss Over Time",
      x="Epochs",
      y="Loss"
    )
    # Gray backgrounds with white text.
    # NOTE(review): panel_grid_major is given an element_rect; grid lines are
    # presumably styled with element_line -- confirm this setting has an effect.
    + theme(
        panel_background=element_rect(fill='gray'),
        plot_background=element_rect(fill='gray'),
        panel_grid_major=element_rect(fill='gray'),
        legend_background=element_rect(fill='gray'),
        axis_text=element_text(color='white'),
        axis_title=element_text(color='white'),
        plot_title=element_text(color='white'),
        plot_subtitle=element_text(color='white'),
        legend_text=element_text(color='white'),
        legend_title=element_text(color='white'),
        label_text=element_text(color='white')
    )
    + ggsize(1600, 900)
)
# display() is provided by the notebook (IPython) environment.
display(loss_graph)
Show the code
# Evaluating the model
# Everything below works on whichever device the model lives on, so it is no
# longer gated on CUDA (the old CPU path never defined `indices`,
# `importances` or `stds` and crashed with a NameError when building nn_res).
model_nn.eval()

# For classification (binary and multiple)
wrapped_model = TorchModelWrapper(model=model_nn, device=device, task="classification")
# For regression, the rest is the same for any of the tasks
# wrapped_model = TorchModelWrapper(model=model_nn, device=device, task="regression")

# Feature importance: how much the wrapper's score drops when one feature
# column is shuffled, averaged over n_repeats shuffles.
# NOTE: this is measured on the full (train + test) data, as before.
result = permutation_importance(
  wrapped_model,
  X_nn, y_nn.ravel(),  # 1-D target; the metric result is the same as for the (n, 1) column
  n_repeats=10,
  random_state=117,
  n_jobs=-1
)

importances = result.importances_mean
stds = result.importances_std
indices = np.argsort(importances)[::-1]  # most important feature first

# One row per feature, ordered from most to least important.
nn_res = pl.DataFrame({"Feature": np.array(X.columns)[indices], "Importance": importances[indices], "Std": stds[indices]})

# use this for regression
# with torch.no_grad():
#   preds = model_nn(X_tst_tensor)
#   test_loss = criterion(preds, y_tst_tensor)
#   print(f"Test Loss: {test_loss.item():.4f}")

# use this for binary classification
with torch.no_grad():
  probs = torch.sigmoid(model_nn(X_tst_tensor)) # Squashes the logits into probabilities between 0 and 1
  # Threshold at 0.5; .cpu() is a no-op for CPU tensors, so the same line yields
  # integer numpy labels on either device.
  preds = (probs >= 0.5).int().cpu().numpy()

y_true = y_tst_tensor.int().cpu().numpy()

# use this for multi classification
# with torch.no_grad():
#   outputs = model_nn(X_tst_tensor)
#   preds = torch.argmax(outputs, dim=1)
Show the code
# Per-class precision / recall / F1 of the neural network on the test split.
print(classification_report(y_true, preds))

# The feature-importance chart is drawn only when CUDA is available, mirroring
# the CUDA check in the evaluation cell that computes the importances.
# NOTE(review): nothing in the plot itself needs a GPU -- confirm the gate is intended.
if torch.cuda.is_available():
  # Ascending sort followed by tail(10) keeps the 10 largest importances.
  top_feats = nn_res.sort("Importance").tail(10)
  feats_bar = (
    ggplot(data=top_feats)
        # Bars with the numeric value on x and the feature name on y.
        + geom_bar(mapping = aes(x = 'Importance', y = 'Feature', fill='Feature', color='Feature'), stat='identity')
        + guides(color="none")
        + labs(
          title="Feature Importance",
          subtitle="The top 10 most important features are shown that helped the model learn on the data.",
          x="Importance",
          y="Feature",
          fill='Feature'
        )
        # Gray backgrounds with white text.
        # NOTE(review): panel_grid_major is given an element_rect; grid lines are
        # presumably styled with element_line -- confirm this setting has an effect.
        + theme(
            panel_background=element_rect(fill='gray'),
            plot_background=element_rect(fill='gray'),
            panel_grid_major=element_rect(fill='gray'),
            legend_background=element_rect(fill='gray'),
            axis_text=element_text(color='white'),
            axis_title=element_text(color='white'),
            plot_title=element_text(color='white'),
            plot_subtitle=element_text(color='white'),
            legend_text=element_text(color='white'),
            legend_title=element_text(color='white'),
            label_text=element_text(color='white')
        )
        + ggsize(1600, 900)
  )

  # display() is provided by the notebook (IPython) environment.
  display(feats_bar)
else:
  print("CUDA is not available!")
              precision    recall  f1-score   support

           0       0.90      0.86      0.88      1756
           1       0.92      0.94      0.93      2827

    accuracy                           0.91      4583
   macro avg       0.91      0.90      0.90      4583
weighted avg       0.91      0.91      0.91      4583